LIBRERIAS

CARGA DE BASES

Nueva Corrección duplicados

# Collapse duplicated tracks. Every (track_name, external_urls_spotify) group
# gets the list of all its artists in `artist_all`, joined with the literal
# separator ",|,". `artist_key` keeps only the first artist of that list, and
# one row per (artist_key, external_urls_spotify) is retained.
df_audio_features <- df_audio_features_raw %>%
  group_by(track_name, external_urls_spotify) %>%
  mutate(artist_all = paste(artist_name, collapse = ",|,")) %>%
  ungroup() %>%
  # The separator contains "|", which is the alternation operator in a regular
  # expression. Unescaped, the pattern could already match at the first plain
  # comma, so artist names that contain a comma produced a wrong key. Escaping
  # the pipe makes the pattern match the literal ",|," separator only.
  mutate(artist_key = sub(",\\|,.*", "", artist_all)) %>%
  dplyr::select(artist_name, artist_all, artist_key, everything()) %>%
  distinct(artist_key, external_urls_spotify, .keep_all = TRUE) %>%
  as.data.frame()
Error in df_audio_features_raw %>% group_by(track_name, external_urls_spotify) %>%  : 
  could not find function "%>%"

CREACION cant_markets

Charts

VECTORES DE FEATURES

RIGHT JOIN audio_features Y charts

# Join the audio features onto the charts table so that every chart entry
# carries the features of its song. right_join() keeps every row of
# `df_charts`, so chart entries without a matching track get NA features.
# According to the original authors, 22993 complete rows are expected.
join_audio_charts <- df_audio_features %>%
  select(
    "artist_name", "artist_all", "artist_key",
    "track_name", "external_urls_spotify", "album_name", "album_release_year",
    all_of(features_continuas), all_of(features_categoricas)
  ) %>%
  right_join(
    df_charts,
    by = c(
      "track_name" = "Track_Name",
      "artist_key" = "Artist",
      "external_urls_spotify" = "URL"
    )
  )

# Some chart entries have no audio features; keep this in mind for the analysis.
library(mice)
md.pattern(join_audio_charts, rotate.names = TRUE)
# The `indicador` metric is a column of `df_charts`; the object `popularidad`
# that was referenced here is not created anywhere in this notebook.
df_charts[is.na(df_charts$indicador), ]

#Agregación de todas las semanas en charts

HISTOGRAMAS Y BARPLOTS DE VARIABLES


##histograma de las variables continuas de audio_features

for (i in features_continuas){

  hist(df_audio_features[,i], main = paste("Histograma de", i, "(all data)"), xlab = i)
  abline(v = mean(df_audio_features[,i], na.rm = TRUE) , col="red")
  abline(v = median(df_audio_features[,i], na.rm = TRUE) , col="blue")
  legend("topright", legend = c("Media", "Mediana"), col=c("red", "blue"), lty =1)

}

#divido los features por su distribución
features_continuas_media <- c('danceability', 'tempo', 'valence')

features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')


##histograma de las variables continuas de charts
for (i in c(features_continuas, "Streams")){

  hist(join_audio_charts[,i], main = paste("Histograma de", i,  "(charts)"), xlab = i)
  abline(v = mean(join_audio_charts[,i], na.rm = TRUE) , col="red")
  abline(v = median(join_audio_charts[,i], na.rm = TRUE) , col="blue")

}

#divido features de charts según su distribución
audio_charts_continuas_media <- c('duration_ms', 'valence')

audio_charts_continuas_mediana <- c('danceability', 'acousticness', 'tempo', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets', "Streams")


##medidas resumen y barplots de las variables categoricas audio_features
for(i in features_categoricas){

  barplot(sort(table(df_audio_features[,i]),decreasing = T), las=2, 
          main = paste("Barplot de", i, "(all data)"))
  # pie(table(df_features_categoricos[,i]))
}



##medidas resumen y barplots de las variables categoricas join_audio_charts

for(i in features_categoricas){
  
  barplot(sort(table(join_audio_charts[,i]),decreasing = T), las=2, 
          main = paste("Barplot de", i, "(charts)"))
  # pie(table(df_features_categoricos[,i]))
}

Analisis de la variable markets_concat

#Hago un join al revés 

df_chart_tojoin <- df_charts[,c("Track_Name", "Artist", "URL")]
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[, c("track_name","artist_key","external_urls_spotify","markets_concat")]

join_barplot <- df_audio_features_tojoin %>% 
  select("track_name","artist_key","external_urls_spotify","markets_concat") %>% 
  left_join( df_chart_tojoin %>%
               select("Track_Name", "Artist", "URL","isinchart"),
               by = c(
                 "track_name" = "Track_Name", 
                      "artist_key" ="Artist", 
                      "external_urls_spotify" = "URL"))


join_barplot$isinchart[is.na(join_barplot$isinchart)] <- 0

join_barplot$isinchart <- factor(join_barplot$isinchart)

tabla_isinchart <- table(unlist(lapply(join_barplot[join_barplot$isinchart==1,"markets_concat"], function(x) strsplit(as.character(x), ','))))

tabla_notinchart <- table(unlist(lapply(join_barplot[join_barplot$isinchart==0,"markets_concat"], function(x) strsplit(as.character(x), ','))))

all_countries <- names(tabla_isinchart)

xlabs <- paste(paste(head(all_countries,3), collapse = ","),"...",paste(tail(all_countries,3),collapse = ","),"(ISO-Codes de Paises)",  collapse = ",")

options(scipen=999)
par(mfrow = c(1,2), las=1, mar=c(3,3,5,3), oma=c(0,1,1,1))
barplot(sort(tabla_isinchart, decreasing = TRUE), names.arg="", main ='En Charts',col=rgb(0.2,0.4,0.6,0.6),xlab = "Paises (ISO-Codes)")
# mtext(side = 1, text = xlabs, line = 1)
barplot(sort(tabla_notinchart, decreasing = TRUE), names.arg = "", main='Fuera de Charts',col=rgb(0.2,0.4,0.6,0.6), xlab = "Paises (ISO-Codes)")
mtext(side = 1, text = xlabs, line = 1, adj = 2)
mtext("Frecuencia de mercados habilitados", side = 3, line = -1, outer = TRUE, cex = 1.3, font =2 )
# mtext("Paises (ISO-Codes)", side = 3, line = -25, outer = TRUE)

CORRELACIONES

#correlaciones en audio features
x <- cor(df_audio_features[,c(features_continuas_media, features_continuas_mediana)],  use =  "complete.obs")
corrplot(x, type = "upper", title = "Correlacion de atributos de audio_features", mar=c(0,0,1,0), method="number" ,number.cex=0.7)

#correlaciones en charts
x <- cor(scale(join_audio_charts[,c(audio_charts_continuas_media, audio_charts_continuas_mediana)]), use =  "complete.obs")
corrplot(x, type = "upper", title = "Correlacion de atributos de los Charts", mar=c(0,0,1,0), method="number", number.cex=0.7 )

#chi2 test #con n grande no se puede usar este test
tabla_key_album <- table(df_audio_features$key_name, df_audio_features$album_type)
cat("Tabla de contigencia entre key y album type\n")
tabla_key_album
chisq.test(tabla_key_album)

SESGO DE VARIABLES

Boxplots Variables Numéricas sin filtrar outliers

#divido los features por su distribución
features_continuas_media <- c('danceability', 'tempo', 'valence')

features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')

all_features <- c(features_continuas_media, features_continuas_mediana)

par(mfrow=c(4,3))
for (feature in all_features){
  boxplot(df_audio_features[,feature], las=2, horizontal=T, main=feature)
}

Con excepción de valence, el resto de las features poseían cierto sesgo. Se decidió transformar las variables que mayor sesgo poseían (duration_ms, instrumentalness, liveness, speechiness) como método para corregir la distribución y achicar la cantidad de outliers. La variable loudness_reg_imp no fue modificada debido a que, al ser negativa, no admite transformaciones logarítmicas ni de raíz cuadrada.

# "danceability,tempo,valence,acousticness,duration_ms,energy,instrumentalness,liveness,speechiness,cant_markets"

#sesgos d las variables                                                   
sort(apply(df_audio_features[,features_continuas], MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)} ))

variables_sesgo <- unlist(strsplit("acousticness,duration_ms,instrumentalness,liveness,speechiness,cant_markets,energy", ","))

df_sesgadas <- df_audio_features[,variables_sesgo]

# Base-10 logarithm that tolerates exact zeros.
#
# x:     numeric vector (a single number works as before).
# delta: small positive value used in place of entries equal to 0, so that
#        log(0) = -Inf is never produced.
# Returns a numeric vector with one value per element of x. NA stays NA;
# negative entries are passed to log() unchanged (NaN with a warning).
logaritmo_ajustado <- function(x, delta) {
  # The former scalar `if (x == 0)` failed on NA ("missing value where
  # TRUE/FALSE needed") and on inputs longer than one element.
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- delta
  log(x, base = 10)
}

delta <- 10^(-6)

df_sesgadas_log_adjust <- data.frame(apply(df_audio_features[,variables_sesgo], MARGIN = c(1,2), 
                                           function(x) logaritmo_ajustado(x,delta)))
# names(df_sesgadas_log_adjust) <- paste(names(df_sesgadas), "_log", sep="")
names(df_sesgadas_log_adjust) <- names(df_sesgadas)

df_datos <- cbind(df_sesgadas, df_sesgadas_log_adjust)



a <- df_sesgadas
b <- df_sesgadas_log_adjust
names(b) <- paste(names(df_sesgadas), "_log", sep="")
merged <- cbind(a,b)

merged <- merged[, order(names(merged))]

round(sort(apply(merged, MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)})),2)

variables_plot <- unlist(strsplit("duration_ms", ","))
variables_plot <- append(variables_plot,paste(variables_plot,"_log", sep=""))
variables_plot <- variables_plot[order(variables_plot)]
plotear <- merged[,variables_plot]

par(mfrow = c(1,2))
for (col in names(plotear)){
  hist(plotear[,col], breaks="FD", main=col, xlab="")
}
summary(df_audio_features[,all_features])


hist(log(df_audio_features$duration_ms))

transformacion <- c('instrumentalness','loudness','liveness','speechiness', 'duration_ms')

# Base-10 logarithm that tolerates non-positive values.
#
# x:     numeric vector (a single number works as before).
# delta: small positive value used in place of every entry that is <= 0, for
#        which the logarithm is not defined.
# Returns a numeric vector with one value per element of x; NA stays NA.
logaritmo_ajustado <- function(x, delta) {
  # The former scalar `if (x <= 0)` failed on NA ("missing value where
  # TRUE/FALSE needed") and on inputs longer than one element.
  no_positivo <- !is.na(x) & x <= 0
  x[no_positivo] <- delta
  log(x, base = 10)
}

delta <- 10^(-6)

par(mfrow=c(2,4))
for (feature in transformacion){
  hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
  hist(unlist(lapply(df_audio_features[,feature], function(x) logaritmo_ajustado(x,delta))), main=paste(feature,"log", sep="_"))
}

# Inverse square root, 1 / sqrt(x), that tolerates exact zeros.
#
# x:     numeric vector (a single number works as before).
# delta: small positive value used in place of entries equal to 0, so that
#        the division by sqrt(0) is avoided.
# Returns a numeric vector with one value per element of x. NA stays NA;
# negative entries are passed to sqrt() unchanged (NaN with a warning).
inv_sqrt_ajustada <- function(x, delta) {
  # The former scalar `if (x == 0)` failed on NA and on inputs longer than one
  # element.
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- delta
  1 / sqrt(x)
}


delta <- 10^(-6)

par(mfrow=c(2,4))
for (feature in transformacion){
  hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
  hist(unlist(lapply(df_audio_features[,feature], function(x) inv_sqrt_ajustada(x,delta))), main=paste(feature,"inv_sqt", sep="_"))
}


par(mfrow=c(2,4))
for (feature in transformacion){
  hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
  hist(sqrt(df_audio_features[,feature]), main=paste(feature,"sqrt", sep="_"))
}

# Build `loudness_reg_imp`: a copy of `loudness` in which every value > 0 is
# replaced by the prediction of a linear model on energy and acousticness.
# The column has to exist before it is plotted, so the model and the
# imputation now run first (the plots used to come before the column was
# created and failed on a fresh run).
fit <- lm(loudness ~ energy + acousticness, data = df_audio_features)

modelo <- fit$coefficients

df_audio_features$loudness_reg_imp <- df_audio_features$loudness

# which() drops the NA comparisons; a logical index containing NA is not
# allowed in a subscripted assignment and would also add all-NA rows to X.
filas_a_imputar <- which(df_audio_features$loudness > 0)

X <- df_audio_features[filas_a_imputar, c("energy", "acousticness")]

df_audio_features$loudness_reg_imp[filas_a_imputar] <-
  modelo[1] + modelo[2] * X[, 1] + modelo[3] * X[, 2]

summary(df_audio_features[, c("loudness", "loudness_reg_imp")])

summary(fit)

# Distribution of the imputed variable (histogram on top, boxplot below).
par(mfrow = c(2, 1))
hist(df_audio_features[, "loudness_reg_imp"], main = "loudness", xlab = "")
#hist(sqrt(df_audio_features[,'loudness_reg_imp']), main= 'loudness_sqrt', xlab="")
boxplot(df_audio_features[, "loudness_reg_imp"], horizontal = TRUE)
#boxplot(sqrt(df_audio_features[,'loudness_reg_imp']), horizontal = T)

La variable instrumentalness tiene mucho sesgo. Se va a recurrir a una transformación logarítmica de la variable, previa transformación del dominio, haciendo que los valores que son 0 sean en realidad 0.000001 (delta = 10^-6).

# Base-10 logarithm with a small offset for exact zeros.
#
# x:     numeric vector (a single number works as before).
# delta: small positive value; entries equal to 0 are evaluated as
#        log(0 + delta) instead of log(0) = -Inf.
# Returns a numeric vector with one value per element of x. NA stays NA;
# negative entries are passed to log() unchanged (NaN with a warning).
logaritmo_ajustado <- function(x, delta) {
  # A scalar `if (x == 0)` cannot handle NA or vector input, so the zero
  # entries are located with a logical mask instead.
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- x[es_cero] + delta
  log(x, base = 10)
}

delta <- 10^(-6)

df_audio_features$instrumentalness_logadjust <- unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta)))

par(mfrow =c(2,2))
hist(df_audio_features$instrumentalness, main="insrumentalness", xlab="")
hist(unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta))), main='instrumentalness_logadjust', ylim = c(0,130500), xlab = "")
boxplot(df_audio_features$instrumentalness, main="", horizontal = T)
boxplot(unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta))), main="", horizontal=T)
# hist(log(1/sqrt(df_audio_features$instrumentalness+0.00001)),main='log(sqrt(x+))', ylim=c(0,130500), xlab = "")

¿Es útil esta transformación?


delta <- 10^(-6)

df_audio_features$instrumentalness_logadjust <- unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta)))

df_chart_tojoin <- df_charts[,c("Track_Name", "Artist", "URL")]
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[, c("track_name","artist_key","external_urls_spotify","instrumentalness", "instrumentalness_logadjust")]

join_histogram <- df_audio_features_tojoin %>% 
  dplyr::select("track_name","artist_key","external_urls_spotify","instrumentalness", "instrumentalness_logadjust") %>% 
  left_join( df_chart_tojoin %>%
               select("Track_Name", "Artist", "URL","isinchart"),
               by = c(
                 "track_name" = "Track_Name", 
                      "artist_key" ="Artist", 
                      "external_urls_spotify" = "URL"))


join_histogram$isinchart[is.na(join_histogram$isinchart)] <- 0

join_histogram$isinchart <- factor(join_histogram$isinchart)


h11 <- hist(join_histogram[join_histogram$isinchart==1,'instrumentalness'])
h11$density <-  h11$counts/sum(h11$counts)*100

h12 <- hist(join_histogram[join_histogram$isinchart==0,'instrumentalness'])
h12$density <-  h12$counts/sum(h12$counts)*100

h21 <- hist(join_histogram[join_histogram$isinchart==1,'instrumentalness_logadjust'])
h21$density <-  h21$counts/sum(h21$counts)*100

h22 <- hist(join_histogram[join_histogram$isinchart==0,'instrumentalness_logadjust'])
h22$density <-  h22$counts/sum(h22$counts)*100

#png("C:/Users/Asus/Desktop/DATA SCIENCE/MAESTRIA/Data Mining/TP/graficos/instrumentalness.png",
#    width = 800, height = 800)
par(mfrow = c(3,2))
plot(h11, main='instrumentalness \nchart', xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
plot(h12, main='instrumentalness \nfuera chart', xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
plot(h21, main ="instrumentalness_log \nchart", xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
plot(h22, main ="instrumentalness_log \nfuera chart", xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
boxplot(join_histogram[join_histogram$isinchart==1,'instrumentalness_logadjust'], main="instrumentalness_log chart", horizontal = T)
boxplot(join_histogram[join_histogram$isinchart==0,'instrumentalness_logadjust'], main="instrumentalness_log fuera chart", horizontal = T)
#dev.off()

Z-Score de Variables que “tienden a la normal”


################################

## FILTRAMOS OUTLIERS POR Z-SCORE para 'danceability', 'tempo', 'valence'

##############################

#z-score para variables que tienden a la normal
#filtro features numericos 

# Features whose distribution is treated as roughly normal; they get a
# classic z-score, (value - mean) / standard deviation.
features_continuas_media <- c("danceability", "tempo", "valence")
df_audio_features_zscore_media <- df_audio_features[, features_continuas_media]

# One "zscore_<feature>" column is appended per feature.
zscore_cols <- paste0("zscore_", features_continuas_media)
for (col in features_continuas_media) {
  name_col <- paste0("zscore_", col)
  # na.rm = TRUE, as in the other summaries of this notebook: without it a
  # single missing value turns the whole z-score column into NA.
  media <- mean(df_audio_features_zscore_media[, col], na.rm = TRUE)
  stdv <- sd(df_audio_features_zscore_media[, col], na.rm = TRUE)
  df_audio_features_zscore_media[, name_col] <-
    (df_audio_features_zscore_media[, col] - media) / stdv
}

par(mfrow=c(1,length(zscore_cols)))
lapply(zscore_cols, function(col) boxplot(df_audio_features_zscore_media[,col],xlab=col))

Analisis de Z-Score por variable

Danceability

#variable: danceability

umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_danceability> umbral_zscore) | (df_audio_features_zscore_media$zscore_danceability< -1*umbral_zscore)
df_audio_features[conditions,] %>%
  select(album_name,artist_name, danceability ) %>%
  arrange(-danceability)

Tempo

#variable: Tempo

umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_tempo> umbral_zscore) | (df_audio_features_zscore_media$zscore_tempo< -1*umbral_zscore)
df_audio_features[conditions,] %>%
  select(album_name,artist_name, tempo ) %>%
  arrange(-tempo)

Valence

#variable: valence
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_valence> umbral_zscore) | (df_audio_features_zscore_media$zscore_valence< -1*umbral_zscore)
df_audio_features[conditions,] %>%
  select(album_name,artist_name, valence ) %>%
  arrange(-valence)

Z-Score Modificado de Variables Asimetricas

################################

## FILTRAMOS OUTLIERS POR Z-SCORE MODIFICADO para 'acousticness', 'duration_ms', 'energy',  'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets'

##############################

features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')

df_audio_features_zscore_mediana <- df_audio_features[,features_continuas_mediana]



zscoremodif_cols <- c()
for(col in names(df_audio_features_zscore_mediana)){
  name_col <- paste('zscoremodif_',col, sep = "")
  zscoremodif_cols <- append(zscoremodif_cols, name_col)
  med = median(df_audio_features_zscore_mediana[,col], na.rm = T)
  MAD = median(abs(df_audio_features_zscore_mediana[,col] - med), na.rm = T)
  df_audio_features_zscore_mediana[, name_col] <- 0.6745 * (df_audio_features_zscore_mediana[,col] - med) / MAD
}


par(mfrow=c(4,2))
lapply(zscoremodif_cols, function(col) boxplot(df_audio_features_zscore_mediana[,col],xlab=col, horizontal = T))

Revisión Variable Instrumentalness

instrumentalness <- c("instrumentalness", "zscoremodif_instrumentalness") 

# Equal-width binning of `instrumentalness` into `n_interv` intervals, shown
# as a bar chart of the number of observations per interval.
x <- df_audio_features$instrumentalness

n_interv <- 10

# Label edges: n_interv + 1 evenly spaced points over the observed range,
# rounded to 2 decimals. The sequence used to start at 0 rather than at
# min(x), which mislabels the bins whenever the minimum is not 0, and
# max(x) without na.rm failed when x contained missing values.
rango_x <- range(x, na.rm = TRUE)
intervalos <- round(seq(rango_x[1], rango_x[2], length.out = n_interv + 1), 2)

# One "lower\nupper" label per interval.
labs <- paste(head(intervalos, -1), tail(intervalos, -1), sep = "\n")

bins <- cut(x, n_interv, include.lowest = TRUE, labels = labs)

barplot(table(bins))

Hacemos K-means para poder discretizar la variable.

# Elbow analysis: total within-cluster sum of squares of k-means on
# `instrumentalness` for k = 2..6.
# NOTE(review): no seed is set in this block, so the k-means results (and the
# cluster numbering) may differ between runs - confirm whether that matters.
ks <- 2:6
sse <- numeric(length(ks))
for (idx in seq_along(ks)) {
  clusters <- kmeans(df_audio_features$instrumentalness,
                     centers = ks[idx], iter.max = 10, nstart = ks[idx])
  sse[idx] <- clusters$tot.withinss
}

plot(ks, sse, type = "l", xlab = "Cantidad de Clusters", ylab = "Suma Error Cuadrático")

# k = 3 is the number of clusters kept for the discretisation.
clusters3 <- kmeans(df_audio_features$instrumentalness, centers = 3, iter.max = 10, nstart = 3)

df_audio_features$clusters <- factor(clusters3$cluster)

lev <- levels(df_audio_features$clusters)

# One "min - max" label per cluster with the range of values it contains.
# The bounds are no longer stored in variables called `min` / `max`, which
# shadowed the base functions of the same name.
labs <- vapply(lev, function(nivel) {
  rango <- range(df_audio_features$instrumentalness[df_audio_features$clusters == nivel])
  paste(rango[1], rango[2], sep = " - ")
}, character(1), USE.NAMES = FALSE)

labs

# barplot(table(factor(clusters3$cluster)), labels = labs)

Preguntas de investigacion

Patron Comun Canciones del Chart

¿Qué características tienen las canciones que están en el chart? ¿Cuál es el patrón común que tienen las canciones más escuchadas? (ver dispersiones, media, gráfico comparativo)



# Standardise a numeric vector: subtract its mean and divide by its standard
# deviation. Missing values are ignored when computing both statistics and
# remain NA in the result.
scale_vble <- function(x) {
  centro <- mean(x, na.rm = TRUE)
  dispersion <- sd(x, na.rm = TRUE)
  (x - centro) / dispersion
}
# Songs that are NOT in the charts: rows of `df_audio_features` without a
# match in `df_charts` on (external_urls_spotify, artist_key).
anti_join_audio_charts <- df_audio_features %>%
  select(
    "artist_name", "artist_all", "artist_key",
    "track_name", "external_urls_spotify", "album_name", "album_release_year",
    all_of(features_continuas), all_of(features_categoricas)
  ) %>%
  anti_join(
    df_charts %>%
      select("Track_Name", "Artist", "URL"),
    by = c(
      "external_urls_spotify" = "URL",
      "artist_key" = "Artist"
    )
  )
# alternative key kept from the original: by = c("track_name" = "Track_Name")

# Keep complete rows only, drop exact duplicates and standardise every
# continuous feature with scale_vble().
anti_join_audio_charts_complete <- na.omit(anti_join_audio_charts)
anti_join_audio_charts_complete_scale <- anti_join_audio_charts_complete %>%
  distinct() %>%
  # all_of() states explicitly that `features_continuas` is an external
  # character vector, consistent with the select() calls above.
  select(all_of(features_continuas)) %>%
  mutate(across(everything(), scale_vble))
nrow(anti_join_audio_charts_complete_scale)

Qué temas perduran mucho en el ranking

Artistas que mas aparecen en el chart

join_audio_charts %>% 
  group_by(artist_name) %>% 
  dplyr::summarise(n = n()) %>% 
  arrange(-n)

Tracks que mas aparecen en el chart

join_audio_charts %>% 
  group_by(track_name, artist_name,external_urls_spotify) %>% 
  dplyr::summarise(n = n()) %>% 
  arrange(-n) %>% 
  select(track_name, n, everything(.))

¿Cuánto tiempo están en un chart?

# cantidad de semanas que estuvieron en el chart

df_charts %>% 
  mutate(week_start=as.Date(week_start),
         week_end = as.Date(week_end),
         week_year = (year(week_start))) %>%
  arrange(Artist, Track_Name) %>% 
  group_by(Artist, Track_Name, URL) %>% 
 dplyr:: summarise( day_in = min(week_start),
             year_in = year(day_in),
             day_max = max(week_end),
             year_max = year(day_max),
             duracion_chart_dias = day_max-day_in,
             duracion_chart_anio = year_max - year_in) %>% 
  arrange(Artist)

#prueba igal de transformacion y test de normalidad

# Normality check on the log10-transformed features: prints one Shapiro-Wilk
# p-value per feature in `features_continuas` (NA when the test cannot run).
#
# shapiro.test() only accepts between 3 and 5000 non-missing values, and
# log10() returns -Inf / NaN for values <= 0. Without these guards the loop
# printed NaN for some features and then stopped with
# "sample size must be between 3 and 5000".
max_n_shapiro <- 5000
for (i in features_continuas) {
  valores <- df_chart_w_lyrics[, i]
  # log10 is only defined for strictly positive values.
  x <- log10(valores[!is.na(valores) & valores > 0])

  if (length(x) > max_n_shapiro) {
    # Evenly spaced, deterministic subsample so that reruns give the same
    # p-value.
    x <- x[round(seq(1, length(x), length.out = max_n_shapiro))]
  }

  if (length(x) < 3 || diff(range(x)) == 0) {
    # Too few usable values, or all of them identical: test not applicable.
    z <- NA_real_
  } else {
    z <- shapiro.test(x)$p.value
  }
  print(z)
}
[1] 1.85241e-21
[1] 1.167246e-23
[1] 7.44622e-11
[1] 1.256851e-30
[1] NaN
[1] 3.287572e-17
NaNs producedError in shapiro.test(x) : sample size must be between 3 and 5000
---
title: "R Notebook"
output: html_notebook
---

# LIBRERIAS
```{r, echo=FALSE, warning=FALSE}
library(ggplot2)
library(tidyverse)
library(readxl)
library(sqldf)
library(lubridate)
library(dplyr)
```

# CARGA DE BASES 
```{r}
df_artist <- read.csv("data/df_artist_sin_duplicados.csv")
df_charts_raw <- read.csv("data/df_charts_sin_duplicados.csv")
df_audio_features_raw <- read.csv("data/audio_features_plano_sin_duplicados.csv")
df_lyrics <- read.csv("data/df_lyrics.csv")

```


## Nueva Corrección duplicados
```{r}
# Data frame ready for the join with the charts.
# Every (track_name, external_urls_spotify) group gets the list of all its
# artists in `artist_all`, joined with the literal separator ",|,".
# `artist_key` keeps only the first artist of that list, and one row per
# (artist_key, external_urls_spotify) is retained.
df_audio_features <- df_audio_features_raw %>%
  group_by(track_name, external_urls_spotify) %>%
  mutate(artist_all = paste(artist_name, collapse = ",|,")) %>%
  ungroup() %>%
  # The separator contains "|", which is the alternation operator in a regular
  # expression. Unescaped, the pattern could already match at the first plain
  # comma, so artist names that contain a comma produced a wrong key. Escaping
  # the pipe makes the pattern match the literal ",|," separator only.
  mutate(artist_key = sub(",\\|,.*", "", artist_all)) %>%
  dplyr::select(artist_name, artist_all, artist_key, everything()) %>%
  distinct(artist_key, external_urls_spotify, .keep_all = TRUE) %>%
  as.data.frame()
```

# CREACION `cant_markets`
```{r}
# Count the comma-separated pieces of one `markets_concat` value.
#
# x: a single value such as "AR,BR,US"; character or factor.
# Returns the number of pieces obtained by splitting x on ",".
contar_market <- function(x) {
  # strsplit() rejects non-character input, so a factor column (as produced
  # by read.csv() when strings are read as factors) is converted first, the
  # same way the market tables further down in this notebook do it.
  piezas <- strsplit(as.character(x), split = ",", fixed = TRUE)
  length(unlist(piezas))
}
df_audio_features$cant_markets <- sapply(df_audio_features[,"markets_concat"], contar_market)
```

# Charts
```{r}
# Popularity metric: one row per (Artist, Track_Name, URL) with the number of
# chart rows of the song, its stream totals/extremes expressed in millions
# (divided by 10^6) and its average/extreme chart positions.
# `indicador` = streams_sum * semanas_sum / position_avg.
df_charts <- df_charts_raw %>%
  group_by(Artist, Track_Name, URL) %>%
  dplyr::summarise(
    semanas_sum = n(),
    streams_sum = sum(Streams, na.rm = TRUE) / 1e6,
    streams_min = min(Streams) / 1e6,
    streams_max = max(Streams) / 1e6,
    position_avg = mean(Position, na.rm = TRUE),
    position_min = min(Position),
    position_max = max(Position)
  ) %>%
  ungroup() %>%
  mutate(indicador = as.numeric(streams_sum * semanas_sum / position_avg))

```


# VECTORES DE FEATURES
```{r}
#features var continuos
features_continuas <- c('acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',   'tempo', 'valence', 'cant_markets')

#features var_ categóricas
features_categoricas <- c('explicit', 'key_name', 'mode_name', "key_mode", "album_type")

```



# RIGHT JOIN `audio_features` Y `charts`
```{r}
# Join the audio features onto the charts table so that every chart entry
# carries the features of its song. right_join() keeps every row of
# `df_charts`, so chart entries without a matching track get NA features.
# According to the original authors, 22993 complete rows are expected.
join_audio_charts <- df_audio_features %>%
  select(
    "artist_name", "artist_all", "artist_key",
    "track_name", "external_urls_spotify", "album_name", "album_release_year",
    all_of(features_continuas), all_of(features_categoricas)
  ) %>%
  right_join(
    df_charts,
    by = c(
      "track_name" = "Track_Name",
      "artist_key" = "Artist",
      "external_urls_spotify" = "URL"
    )
  )

# Some chart entries have no audio features; keep this in mind for the analysis.
library(mice)
md.pattern(join_audio_charts, rotate.names = TRUE)
# The `indicador` metric is a column of `df_charts`; the object `popularidad`
# that was referenced here is not created anywhere in this notebook.
df_charts[is.na(df_charts$indicador), ]

```



# Agregación de todas las semanas en charts
```{r, warning=FALSE}

# Aggregate the chart rows into one row per song (the key is
# `groupping_cols`): mean of every continuous feature, first value of every
# categorical feature, min/max/mean of the numeric chart columns and the
# number of distinct weeks.
features_continuas <- c("acousticness", "danceability", "duration_ms", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "tempo", "valence", "cant_markets")

features_categoricas <- c("explicit", "key_name", "mode_name", "key_mode", "album_type")

groupping_cols <- c("artist_name", "artist_all", "artist_key", "track_name", "external_urls_spotify", "album_name", "album_release_year")

numeric_col_charts <- c("Position", "Streams")

week_start <- c("week_start")

# NOTE(review): "Position", "Streams" and "week_start" are columns of the raw
# weekly charts; the `df_charts` summary built earlier in this notebook only
# exposes aggregated columns (streams_sum, position_avg, ...). Confirm that
# `join_audio_charts` really contains them before running this chunk.

# The grouping key now comes from `groupping_cols` instead of being typed out
# a second time.
chart_group <- join_audio_charts %>%
  group_by(across(all_of(groupping_cols)))

continuas_summarized <- chart_group %>%
  summarise(across(all_of(features_continuas), ~ mean(.x, na.rm = TRUE)),
            .groups = "drop")
categoricas_summarizes <- chart_group %>%
  summarise(across(all_of(features_categoricas), first), .groups = "drop")
numeric_charts_summarizes <- chart_group %>%
  summarise(across(all_of(numeric_col_charts),
                   list(min = min, max = max, avg = mean)),
            .groups = "drop")
# The week count is named directly. The former
# `names(cant_semanas$week_start) <- "cant_semanas"` did not rename the
# column; it only attached a names attribute to the column's values.
cant_semanas <- chart_group %>%
  summarise(across(all_of(week_start), n_distinct, .names = "cant_semanas"),
            .groups = "drop")

# Every summary starts with the grouping columns; they are kept once (from
# the first table) and dropped from the others by count, not by a literal 1:7.
columnas_clave <- seq_along(groupping_cols)
aggregation_df <- cbind(
  numeric_charts_summarizes,
  cant_semanas[, -columnas_clave],
  continuas_summarized[, -columnas_clave],
  categoricas_summarizes[, -columnas_clave]
)

cols <- names(aggregation_df)
numeric_cols <- cols[vapply(aggregation_df, is.numeric, logical(1))]

# Summary of every numeric column except the first one.
summary(aggregation_df[, numeric_cols[-1]])


```


```{r}
df_lyrics_unicas <- df_lyrics %>% distinct(artist_name, track_name, lyrics)
nrow(df_lyrics_unicas)

df_chart_w_lyrics <- merge(join_audio_charts, df_lyrics_unicas, by.x = c("artist_name","track_name"), by.y= c("artist_name","track_name"), all.x=TRUE, all.y = FALSE)

df_chart_w_lyrics <- df_chart_w_lyrics[!is.na(df_chart_w_lyrics$lyrics),]

```



# HISTOGRAMAS Y BARPLOTS DE VARIABLES
```{r}

##histograma de las variables continuas de audio_features

for (i in features_continuas){

  hist(df_audio_features[,i], main = paste("Histograma de", i, "(all data)"), xlab = i)
  abline(v = mean(df_audio_features[,i], na.rm = TRUE) , col="red")
  abline(v = median(df_audio_features[,i], na.rm = TRUE) , col="blue")
  legend("topright", legend = c("Media", "Mediana"), col=c("red", "blue"), lty =1)

}

#divido los features por su distribución
features_continuas_media <- c('danceability', 'tempo', 'valence')

features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')


##histograma de las variables continuas de charts
for (i in c(features_continuas, "Streams")){

  hist(join_audio_charts[,i], main = paste("Histograma de", i,  "(charts)"), xlab = i)
  abline(v = mean(join_audio_charts[,i], na.rm = TRUE) , col="red")
  abline(v = median(join_audio_charts[,i], na.rm = TRUE) , col="blue")

}

#divido features de charts según su distribución
audio_charts_continuas_media <- c('duration_ms', 'valence')

audio_charts_continuas_mediana <- c('danceability', 'acousticness', 'tempo', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets', "Streams")


##medidas resumen y barplots de las variables categoricas audio_features
for(i in features_categoricas){

  barplot(sort(table(df_audio_features[,i]),decreasing = T), las=2, 
          main = paste("Barplot de", i, "(all data)"))
  # pie(table(df_features_categoricos[,i]))
}



##medidas resumen y barplots de las variables categoricas join_audio_charts

for(i in features_categoricas){
  
  barplot(sort(table(join_audio_charts[,i]),decreasing = T), las=2, 
          main = paste("Barplot de", i, "(charts)"))
  # pie(table(df_features_categoricos[,i]))
}


```

## Analisis de la variable `markets_concat`

```{r}
#Hago un join al revés 

df_chart_tojoin <- df_charts[,c("Track_Name", "Artist", "URL")]
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[, c("track_name","artist_key","external_urls_spotify","markets_concat")]

join_barplot <- df_audio_features_tojoin %>% 
  select("track_name","artist_key","external_urls_spotify","markets_concat") %>% 
  left_join( df_chart_tojoin %>%
               select("Track_Name", "Artist", "URL","isinchart"),
               by = c(
                 "track_name" = "Track_Name", 
                      "artist_key" ="Artist", 
                      "external_urls_spotify" = "URL"))


join_barplot$isinchart[is.na(join_barplot$isinchart)] <- 0

join_barplot$isinchart <- factor(join_barplot$isinchart)

tabla_isinchart <- table(unlist(lapply(join_barplot[join_barplot$isinchart==1,"markets_concat"], function(x) strsplit(as.character(x), ','))))

tabla_notinchart <- table(unlist(lapply(join_barplot[join_barplot$isinchart==0,"markets_concat"], function(x) strsplit(as.character(x), ','))))

all_countries <- names(tabla_isinchart)

xlabs <- paste(paste(head(all_countries,3), collapse = ","),"...",paste(tail(all_countries,3),collapse = ","),"(ISO-Codes de Paises)",  collapse = ",")

options(scipen=999)
par(mfrow = c(1,2), las=1, mar=c(3,3,5,3), oma=c(0,1,1,1))
barplot(sort(tabla_isinchart, decreasing = TRUE), names.arg="", main ='En Charts',col=rgb(0.2,0.4,0.6,0.6),xlab = "Paises (ISO-Codes)")
# mtext(side = 1, text = xlabs, line = 1)
barplot(sort(tabla_notinchart, decreasing = TRUE), names.arg = "", main='Fuera de Charts',col=rgb(0.2,0.4,0.6,0.6), xlab = "Paises (ISO-Codes)")
mtext(side = 1, text = xlabs, line = 1, adj = 2)
mtext("Frecuencia de mercados habilitados", side = 3, line = -1, outer = TRUE, cex = 1.3, font =2 )
# mtext("Paises (ISO-Codes)", side = 3, line = -25, outer = TRUE)
```


# CORRELACIONES
```{r}
# Correlation matrix of the continuous audio features, computed on the rows
# without missing values and drawn as the upper triangle with the
# coefficients printed as numbers.
x <- cor(df_audio_features[, c(features_continuas_media, features_continuas_mediana)],
         use = "complete.obs")
# corrplot is never attached with library() in this notebook, so the function
# is called through its namespace.
corrplot::corrplot(x, type = "upper", title = "Correlacion de atributos de audio_features",
                   mar = c(0, 0, 1, 0), method = "number", number.cex = 0.7)
```


```{r}

# Correlation matrix of the continuous variables of the charts table,
# computed on the rows without missing values and drawn as the upper triangle
# with the coefficients printed as numbers.
# NOTE(review): `audio_charts_continuas_mediana` includes "Streams", while the
# `df_charts` summary built earlier only exposes streams_sum / streams_min /
# streams_max - confirm the column exists in `join_audio_charts`.
x <- cor(scale(join_audio_charts[, c(audio_charts_continuas_media, audio_charts_continuas_mediana)]),
         use = "complete.obs")
# corrplot is never attached with library() in this notebook, so the function
# is called through its namespace.
corrplot::corrplot(x, type = "upper", title = "Correlacion de atributos de los Charts",
                   mar = c(0, 0, 1, 0), method = "number", number.cex = 0.7)
```

```{r}

#chi2 test #con n grande no se puede usar este test
tabla_key_album <- table(df_audio_features$key_name, df_audio_features$album_type)
cat("Tabla de contigencia entre key y album type\n")
tabla_key_album
chisq.test(tabla_key_album)
```


# SESGO DE VARIABLES 

## Boxplots Variables Numéricas sin filtrar outliers
```{r}
#divido los features por su distribución
features_continuas_media <- c('danceability', 'tempo', 'valence')

features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')

all_features <- c(features_continuas_media, features_continuas_mediana)

par(mfrow=c(4,3))
for (feature in all_features){
  boxplot(df_audio_features[,feature], las=2, horizontal=T, main=feature)
}
```

Con excepción de valence, el resto de las features poseían cierto sesgo. Se decidió transformar las variables que mayor sesgo poseían (duration_ms, instrumentalness, liveness, speechiness) como método para corregir la distribución y achicar la cantidad de outliers. La variable loudness_reg_imp no fue modificada debido a que, al ser negativa, no admite transformaciones logarítmicas ni de raíz cuadrada.


```{r}
# "danceability,tempo,valence,acousticness,duration_ms,energy,instrumentalness,liveness,speechiness,cant_markets"

# Skewness of each continuous feature, measured as 3 * (mean - median) / sd
# and printed in ascending order.
sort(
  apply(
    df_audio_features[, features_continuas],
    MARGIN = 2,
    function(x) {
      diferencia <- mean(x, na.rm = TRUE) - median(x, na.rm = TRUE)
      (3 * diferencia) / sd(x, na.rm = TRUE)
    }
  )
)

# Features that get the adjusted log transform below.
variables_sesgo <- strsplit(
  "acousticness,duration_ms,instrumentalness,liveness,speechiness,cant_markets,energy",
  ","
)[[1]]

df_sesgadas <- df_audio_features[, variables_sesgo]

#' Base-10 logarithm that tolerates exact zeros.
#'
#' Values equal to 0 are replaced by `delta` before taking the logarithm, so
#' the result is finite for them. Works on a single value or on a whole
#' vector, and missing values stay missing instead of raising an error in the
#' zero test.
#'
#' @param x Numeric value or vector.
#' @param delta Positive constant substituted for exact zeros.
#' @return Numeric vector of the same length as `x` with log10 of the
#'   adjusted values (NaN, with a warning, for negative inputs).
logaritmo_ajustado <- function(x, delta) {
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- delta
  log(x, base = 10)
}

# Constant that replaces exact zeros before taking the logarithm.
delta <- 10^(-6)

# Cell-by-cell adjusted log10 of the skewed features.
df_sesgadas_log_adjust <- data.frame(
  apply(
    df_audio_features[, variables_sesgo],
    MARGIN = c(1, 2),
    function(x) logaritmo_ajustado(x, delta)
  )
)
# names(df_sesgadas_log_adjust) <- paste(names(df_sesgadas), "_log", sep="")
names(df_sesgadas_log_adjust) <- names(df_sesgadas)

df_datos <- cbind(df_sesgadas, df_sesgadas_log_adjust)

# Raw and log-transformed columns side by side; the transformed ones carry a
# "_log" suffix and the columns are then ordered alphabetically.
a <- df_sesgadas
b <- df_sesgadas_log_adjust
names(b) <- paste0(names(df_sesgadas), "_log")
merged <- cbind(a, b)

merged <- merged[, order(names(merged))]

# Skewness (3 * (mean - median) / sd) before and after the transform.
round(
  sort(
    apply(
      merged,
      MARGIN = 2,
      function(x) {
        diferencia <- mean(x, na.rm = TRUE) - median(x, na.rm = TRUE)
        (3 * diferencia) / sd(x, na.rm = TRUE)
      }
    )
  ),
  2
)

```


```{r}

# Histogram of each selected variable next to its "_log" counterpart.
variables_plot <- strsplit("duration_ms", ",")[[1]]
variables_plot <- sort(c(variables_plot, paste0(variables_plot, "_log")))
plotear <- merged[, variables_plot]

par(mfrow = c(1, 2))
for (col in names(plotear)) {
  hist(plotear[[col]], breaks = "FD", main = col, xlab = "")
}

```



```{r}
# Descriptive statistics for every continuous feature.
summary(df_audio_features[all_features])
```
```{r}


# Histogram of the natural logarithm of duration_ms. No main/xlab is given, so
# hist() builds the title and x label from this argument expression.
hist(log(df_audio_features$duration_ms))

```




```{r}

# Features whose histograms are compared before and after each transform.
transformacion <- c('instrumentalness','loudness','liveness','speechiness', 'duration_ms')

#' Base-10 logarithm that tolerates zeros and negative values.
#'
#' Every value that is <= 0 is replaced by `delta` before taking the
#' logarithm. Works on a single value or on a whole vector, and missing
#' values stay missing instead of raising an error in the comparison.
#'
#' @param x Numeric value or vector.
#' @param delta Positive constant substituted for non-positive values.
#' @return Numeric vector of the same length as `x` with log10 of the
#'   adjusted values.
logaritmo_ajustado <- function(x, delta) {
  no_positivo <- !is.na(x) & x <= 0
  x[no_positivo] <- delta
  log(x, base = 10)
}

delta <- 10^(-6)

# Raw histograms on the first row and their log-adjusted versions on the
# second. par(mfrow = ) fills the grid row by row, so it needs one column per
# feature; a fixed 2 x 4 grid cannot hold the 2 x 5 plots drawn here.
par(mfrow = c(2, length(transformacion)))
for (feature in transformacion){
  hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
  hist(unlist(lapply(df_audio_features[,feature], function(x) logaritmo_ajustado(x,delta))), main=paste(feature,"log", sep="_"))
}
```

```{r}

#' Inverse square root that tolerates exact zeros.
#'
#' Values equal to 0 are replaced by `delta` before computing 1 / sqrt(x), so
#' the result is finite for them. Works on a single value or on a whole
#' vector, and missing values stay missing instead of raising an error in the
#' zero test.
#'
#' @param x Numeric value or vector.
#' @param delta Positive constant substituted for exact zeros.
#' @return Numeric vector of the same length as `x` (NaN, with a warning, for
#'   negative inputs).
inv_sqrt_ajustada <- function(x, delta) {
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- delta
  1 / sqrt(x)
}


delta <- 10^(-6)

# Raw histograms on the first row and their inverse-square-root versions on
# the second. The grid gets one column per feature so that all 2 x 5 plots fit
# on a single panel (a fixed 2 x 4 grid overflows).
par(mfrow = c(2, length(transformacion)))
for (feature in transformacion){
  hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
  hist(unlist(lapply(df_audio_features[,feature], function(x) inv_sqrt_ajustada(x,delta))), main=paste(feature,"inv_sqt", sep="_"))
}



```


```{r}


# Raw histograms on the first row and their square-root versions on the
# second. The grid gets one column per feature so that all 2 x 5 plots fit on
# a single panel (a fixed 2 x 4 grid overflows).
par(mfrow = c(2, length(transformacion)))
for (feature in transformacion){
  hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
  hist(sqrt(df_audio_features[,feature]), main=paste(feature,"sqrt", sep="_"))
}

```



```{r}

# Histogram and boxplot of the imputed loudness column, stacked vertically.
# NOTE(review): loudness_reg_imp is assigned by the lm() chunk that follows
# this one in the file; confirm the column already exists when this chunk runs.
par(mfrow = c(2, 1))
hist(df_audio_features[["loudness_reg_imp"]], main = "loudness", xlab = "")
#hist(sqrt(df_audio_features[,'loudness_reg_imp']), main= 'loudness_sqrt', xlab="")
boxplot(df_audio_features[["loudness_reg_imp"]], horizontal = TRUE)
#boxplot(sqrt(df_audio_features[,'loudness_reg_imp']), horizontal = T)





```

```{r}
# Regression imputation of loudness: rows whose loudness is > 0 get the value
# predicted by a linear model on energy and acousticness; every other row
# keeps its original loudness.
# NOTE(review): the model is fitted on all rows, including the ones that are
# later overwritten — confirm that this is intended.
fit <- lm(loudness ~ energy + acousticness, data = df_audio_features)

modelo <- fit$coefficients

df_audio_features$loudness_reg_imp <- df_audio_features$loudness

# which() discards NA comparisons. A logical index containing NA would add
# all-NA rows to X and make the sub-assignment below fail.
filas_a_imputar <- which(df_audio_features$loudness > 0)

X <- df_audio_features[filas_a_imputar, c("energy", "acousticness")]

df_audio_features$loudness_reg_imp[filas_a_imputar] <-
  modelo[1] + modelo[2] * X[, 1] + modelo[3] * X[, 2]

summary(df_audio_features[, c("loudness", "loudness_reg_imp")])

summary(fit)
```



La variable `instrumentalness` presenta mucho sesgo. Se recurre a una transformación logarítmica (base 10), previo ajuste del dominio: los valores iguales a 0 se reemplazan por 0.000001 ($10^{-6}$) antes de aplicar el logaritmo.

```{r}
#' Base-10 logarithm that tolerates exact zeros.
#'
#' Values equal to 0 are replaced by `delta` before taking the logarithm, so
#' the result is finite for them. Works on a single value or on a whole
#' vector, and missing values stay missing instead of raising an error in the
#' zero test.
#'
#' @param x Numeric value or vector.
#' @param delta Positive constant substituted for exact zeros.
#' @return Numeric vector of the same length as `x` with log10 of the
#'   adjusted values (NaN, with a warning, for negative inputs).
logaritmo_ajustado <- function(x, delta) {
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- delta
  log(x, base = 10)
}

delta <- 10^(-6)

# Adjusted log10 of instrumentalness, computed once and stored as a column so
# the plots below reuse it instead of recomputing the transform for each one.
df_audio_features$instrumentalness_logadjust <- vapply(
  df_audio_features$instrumentalness,
  function(x) logaritmo_ajustado(x, delta),
  numeric(1)
)

# Top row: histograms of the raw and transformed variable.
# Bottom row: the matching boxplots.
par(mfrow = c(2, 2))
hist(df_audio_features$instrumentalness, main = "instrumentalness", xlab = "")
hist(
  df_audio_features$instrumentalness_logadjust,
  main = "instrumentalness_logadjust",
  ylim = c(0, 130500),
  xlab = ""
)
boxplot(df_audio_features$instrumentalness, main = "", horizontal = TRUE)
boxplot(df_audio_features$instrumentalness_logadjust, main = "", horizontal = TRUE)
# hist(log(1/sqrt(df_audio_features$instrumentalness+0.00001)),main='log(sqrt(x+))', ylim=c(0,130500), xlab = "")

```

¿Es útil esta transformación? 

```{r}

delta <- 10^(-6)

# Adjusted log10 of instrumentalness (exact zeros are replaced by `delta`).
df_audio_features$instrumentalness_logadjust <- vapply(
  df_audio_features$instrumentalness,
  function(x) logaritmo_ajustado(x, delta),
  numeric(1)
)

# Chart keys plus a membership flag.
df_chart_tojoin <- df_charts[, c("Track_Name", "Artist", "URL")]
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[, c("track_name", "artist_key", "external_urls_spotify", "instrumentalness", "instrumentalness_logadjust")]

# df_charts can hold several rows for the same track, so the chart keys are
# de-duplicated before joining. Otherwise every charted track would be
# repeated once per chart row and over-weighted in the histograms below.
join_histogram <- df_audio_features_tojoin %>%
  left_join(
    dplyr::distinct(df_chart_tojoin),
    by = c(
      "track_name" = "Track_Name",
      "artist_key" = "Artist",
      "external_urls_spotify" = "URL"
    )
  )

# Tracks without a chart match are flagged 0.
join_histogram$isinchart[is.na(join_histogram$isinchart)] <- 0

join_histogram$isinchart <- factor(join_histogram$isinchart)

# Builds a histogram object whose `density` holds the percentage of
# observations per bin, so plot(..., freq = FALSE) draws percentages.
# plot = FALSE keeps hist() from drawing an extra figure for each subset.
hist_porcentaje <- function(valores) {
  h <- hist(valores, plot = FALSE)
  h$density <- h$counts / sum(h$counts) * 100
  h
}

en_chart <- join_histogram$isinchart == 1

h11 <- hist_porcentaje(join_histogram[en_chart, "instrumentalness"])
h12 <- hist_porcentaje(join_histogram[!en_chart, "instrumentalness"])
h21 <- hist_porcentaje(join_histogram[en_chart, "instrumentalness_logadjust"])
h22 <- hist_porcentaje(join_histogram[!en_chart, "instrumentalness_logadjust"])

#png("C:/Users/Asus/Desktop/DATA SCIENCE/MAESTRIA/Data Mining/TP/graficos/instrumentalness.png",
#    width = 800, height = 800)
par(mfrow = c(3, 2))
plot(h11, main = "instrumentalness \nchart", xlab = "", ylab = "Porcentaje", freq = FALSE, col = "grey", ylim = c(0, 100))
plot(h12, main = "instrumentalness \nfuera chart", xlab = "", ylab = "Porcentaje", freq = FALSE, col = "grey", ylim = c(0, 100))
plot(h21, main = "instrumentalness_log \nchart", xlab = "", ylab = "Porcentaje", freq = FALSE, col = "grey", ylim = c(0, 100))
plot(h22, main = "instrumentalness_log \nfuera chart", xlab = "", ylab = "Porcentaje", freq = FALSE, col = "grey", ylim = c(0, 100))
boxplot(join_histogram[en_chart, "instrumentalness_logadjust"], main = "instrumentalness_log chart", horizontal = TRUE)
boxplot(join_histogram[!en_chart, "instrumentalness_logadjust"], main = "instrumentalness_log fuera chart", horizontal = TRUE)
#dev.off()

```



### Z-Score de Variables que "tienden a la normal"
```{r}

################################

## OUTLIER FILTER BY Z-SCORE for 'danceability', 'tempo', 'valence'

##############################

# Z-score for the variables that tend towards a normal distribution.

# Features split by their distribution.
features_continuas_media <- c('danceability', 'tempo', 'valence')
df_audio_features_zscore_media <- df_audio_features[, features_continuas_media]

# Adds one `zscore_<feature>` column per feature. Missing values are ignored
# when estimating the mean and the standard deviation (as the other summaries
# in this file do), so a single NA no longer turns a whole column into NA.
zscore_cols <- paste0("zscore_", features_continuas_media)
for (col in features_continuas_media) {
  name_col <- paste0("zscore_", col)
  media <- mean(df_audio_features_zscore_media[[col]], na.rm = TRUE)
  stdv <- sd(df_audio_features_zscore_media[[col]], na.rm = TRUE)
  df_audio_features_zscore_media[[name_col]] <-
    (df_audio_features_zscore_media[[col]] - media) / stdv
}

# One boxplot per z-score column. invisible() keeps the list of boxplot
# statistics returned by lapply() out of the rendered output.
par(mfrow = c(1, length(zscore_cols)))
invisible(lapply(
  zscore_cols,
  function(col) boxplot(df_audio_features_zscore_media[[col]], xlab = col)
))
```

### Análisis de Z-Score por variable
 
#### Danceability

```{r}
# Danceability: rows whose z-score is more than `umbral_zscore` away from 0 in
# either direction, listed from highest to lowest danceability.
umbral_zscore <- 3
conditions <- abs(df_audio_features_zscore_media$zscore_danceability) > umbral_zscore
df_audio_features[conditions, ] %>%
  select(album_name, artist_name, danceability) %>%
  arrange(desc(danceability))
```

#### Tempo

```{r}
# Tempo: rows whose z-score is more than `umbral_zscore` away from 0 in either
# direction, listed from highest to lowest tempo.
umbral_zscore <- 3
conditions <- abs(df_audio_features_zscore_media$zscore_tempo) > umbral_zscore
df_audio_features[conditions, ] %>%
  select(album_name, artist_name, tempo) %>%
  arrange(desc(tempo))
```

#### Valence

```{r}
# Valence: rows whose z-score is more than `umbral_zscore` away from 0 in
# either direction, listed from highest to lowest valence.
umbral_zscore <- 3
conditions <- abs(df_audio_features_zscore_media$zscore_valence) > umbral_zscore
df_audio_features[conditions, ] %>%
  select(album_name, artist_name, valence) %>%
  arrange(desc(valence))
```

### Z-Score Modificado de Variables Asimétricas

```{r}
################################

## OUTLIER FILTER BY MODIFIED Z-SCORE for 'acousticness', 'duration_ms', 'energy',  'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets'

##############################

features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')

df_audio_features_zscore_mediana <- df_audio_features[, features_continuas_mediana]

# Adds one `zscoremodif_<feature>` column per feature:
#   0.6745 * (x - median) / MAD
# When more than half of the values equal the median, MAD is 0 and the ratio
# would be Inf/NaN. In that case the score falls back to the mean absolute
# deviation: (x - median) / (1.253314 * MeanAD).
zscoremodif_cols <- paste0("zscoremodif_", features_continuas_mediana)
for (col in features_continuas_mediana) {
  name_col <- paste0("zscoremodif_", col)
  valores <- df_audio_features_zscore_mediana[[col]]
  med <- median(valores, na.rm = TRUE)
  desvios_abs <- abs(valores - med)
  MAD <- median(desvios_abs, na.rm = TRUE)
  if (isTRUE(MAD > 0)) {
    puntaje <- 0.6745 * (valores - med) / MAD
  } else {
    mean_ad <- mean(desvios_abs, na.rm = TRUE)
    puntaje <- (valores - med) / (1.253314 * mean_ad)
  }
  df_audio_features_zscore_mediana[[name_col]] <- puntaje
}

# One horizontal boxplot per modified z-score column. invisible() keeps the
# list of boxplot statistics returned by lapply() out of the rendered output.
par(mfrow = c(4, 2))
invisible(lapply(
  zscoremodif_cols,
  function(col) boxplot(df_audio_features_zscore_mediana[[col]], xlab = col, horizontal = TRUE)
))

```


#### Revisión Variable `Instrumentalness`
```{r}
instrumentalness <- c("instrumentalness", "zscoremodif_instrumentalness")

x <- df_audio_features$instrumentalness

n_interv <- 10

# Equally spaced break points from the observed minimum to the observed
# maximum. The same break points feed both the labels and cut(), so every bar
# is labelled with the limits of the interval it actually counts. Missing
# values are ignored when locating the range.
cortes <- seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE), length.out = n_interv + 1)
intervalos <- round(cortes, 2)

# Label of bin i: lower limit, line break, upper limit.
labs <- paste(intervalos[-length(intervalos)], intervalos[-1], sep = "\n")

bins <- cut(x, breaks = cortes, include.lowest = TRUE, labels = labs)

barplot(table(bins))

```

Hacemos K-means para poder discretizar la variable. 

```{r}
# Elbow curve: total within-cluster sum of squares for k = 2..6.
valores_k <- 2:6
sse <- numeric(length(valores_k))
for (idx in seq_along(valores_k)) {
  k <- valores_k[idx]
  clusters <- kmeans(df_audio_features$instrumentalness, centers = k, iter.max = 10, nstart = k)
  sse[idx] <- clusters$tot.withinss
}

plot(valores_k, sse, type = "l", xlab = "Cantidad de Clusters", ylab = "Suma Error Cuadrático")

# k = 3
clusters3 <- kmeans(df_audio_features$instrumentalness, centers = 3, iter.max = 10, nstart = 3)

df_audio_features$clusters <- factor(clusters3$cluster)

lev <- levels(df_audio_features$clusters)

# "min - max" range of instrumentalness inside each cluster. The limits are
# kept in a local variable so that no global objects named `min` or `max` are
# created on top of the base functions.
labs <- vapply(
  lev,
  function(nivel) {
    valores <- df_audio_features$instrumentalness[df_audio_features$clusters == nivel]
    paste(min(valores), max(valores), sep = " - ")
  },
  character(1),
  USE.NAMES = FALSE
)

labs

# barplot(table(factor(clusters3$cluster)), labels = labs)



```


# Preguntas de investigacion

## Patrón Común Canciones del Chart
¿Qué características tienen las canciones que están en el chart? ¿Cuál es el patrón común que tienen las canciones más escuchadas? (ver dispersiones, media, gráfico comparativo)
```{r}


# Standardises a numeric vector: subtracts its mean and divides by its
# standard deviation, both computed ignoring missing values.
scale_vble <- function(x) {
  centro <- mean(x, na.rm = TRUE)
  dispersion <- sd(x, na.rm = TRUE)
  (x - centro) / dispersion
}

```
```{r}
# Anti join: tracks with audio features that have no matching row in the
# charts (matched on Spotify URL and artist key).
anti_join_audio_charts <- df_audio_features %>%
  select("artist_name", "artist_all", "artist_key",
         "track_name", "external_urls_spotify", "album_name", "album_release_year",
         all_of(features_continuas), all_of(features_categoricas)) %>%
  anti_join(df_charts %>%
              select("Track_Name", "Artist", "URL"),
            by = c("external_urls_spotify" = "URL",
                   "artist_key" = "Artist"))
               # by = c("track_name" = "Track_Name"))


# Complete, distinct rows only; then keep the continuous features and
# standardise each of them.
anti_join_audio_charts_complete <- na.omit(anti_join_audio_charts)
anti_join_audio_charts_complete_scale <- anti_join_audio_charts_complete %>%
  distinct() %>%
  # all_of() states explicitly that features_continuas is an external character
  # vector of column names, matching the select() call above.
  select(all_of(features_continuas)) %>%
  mutate_all(scale_vble)
nrow(anti_join_audio_charts_complete_scale)

```

## Qué temas perduran mucho en el ranking

### Artistas que más aparecen en el chart
```{r}
# Number of chart rows per artist, most frequent first.
join_audio_charts %>%
  group_by(artist_name) %>%
  dplyr::summarise(n = n()) %>%
  arrange(desc(n))
```

### Tracks que más aparecen en el chart
```{r}
# Number of chart rows per track (name, artist and Spotify URL), most frequent
# first, with the track name and the count as leading columns.
join_audio_charts %>%
  group_by(track_name, artist_name, external_urls_spotify) %>%
  dplyr::summarise(n = n()) %>%
  arrange(desc(n)) %>%
  select(track_name, n, everything(.))

```


# ¿Cuánto tiempo están en un chart? 

```{r}
# Time each track spent in the chart: first week start, last week end, and the
# span between them in days and in calendar years.
df_charts %>%
  mutate(
    week_start = as.Date(week_start),
    week_end = as.Date(week_end),
    week_year = year(week_start)
  ) %>%
  arrange(Artist, Track_Name) %>%
  group_by(Artist, Track_Name, URL) %>%
  dplyr::summarise(
    day_in = min(week_start),
    year_in = year(day_in),
    day_max = max(week_end),
    year_max = year(day_max),
    duracion_chart_dias = day_max - day_in,
    duracion_chart_anio = year_max - year_in
  ) %>%
  arrange(Artist)

```

# Prueba igal de transformación y test de normalidad
```{r}
join_audio_charts[1:5, "acousticness"]^2

library(nortest)

log10(df_chart_w_lyrics$acousticness)

# Shapiro-Wilk p-value of the log10 of each continuous feature.
for (i in features_continuas) {
  x <- log10(df_chart_w_lyrics[[i]])
  # log10() gives -Inf for zeros and NaN for negative values. shapiro.test()
  # drops NA/NaN but not -Inf, so only finite values are tested.
  x <- x[is.finite(x)]
  if (length(x) < 3 || length(x) > 5000) {
    # shapiro.test() only accepts between 3 and 5000 observations; report NA
    # for this feature instead of aborting the whole loop.
    warning("shapiro.test omitido para '", i, "': ", length(x),
            " valores finitos (se requieren entre 3 y 5000)", call. = FALSE)
    z <- NA_real_
  } else {
    z <- shapiro.test(x)$p.value
  }
  print(z)
}


```




